Get GitHub R Repositories from GitHub API v3


In [1]:
import requests
import json
import time
import datetime
import gzip

from dateutil import parser

In [2]:
# Registry of APIKey objects. It is filled from the key file in this cell and
# re-sorted in place by make_headers(), so index 0 is always the key in use.
API_KEYS = list()

class APIKey(object):
    """One GitHub API token and the moment from which it may be used again.

    Attributes:
        owner: label of the key's owner; sent as the User-agent header and
            shown in the "Using ...'s API key" log message.
        key: the token string sent in the Authorization header.
        until: datetime before which the key must not be used (pushed into
            the future when the key hits its rate limit).
    """

    def __init__(self, owner, key, until):
        self.owner, self.key, self.until = owner, key, until
        
# Load the API keys: one "<owner> <token>" pair per line, whitespace-separated.
# The file is opened in a `with` block so the handle is always closed.
with open('github-api.key') as key_file:
    for line_number, line in enumerate(key_file, 1):
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        if len(fields) != 2:
            # Do not echo the offending line: it may contain a secret token.
            raise ValueError(
                'github-api.key line {}: expected "<owner> <key>"'.format(line_number))
        owner, key = fields
        # Every key starts out immediately usable.
        API_KEYS.append(APIKey(owner=owner, key=key, until=datetime.datetime.now()))

# Fail here with a clear message rather than with an IndexError on API_KEYS[0] later.
if not API_KEYS:
    raise ValueError('github-api.key contains no API keys')

In [3]:
# Search query template for one page (100 items) of non-fork R repositories
# pushed between {start} and {end}; {page} is the 1-based page number.
URL = (
    'https://api.github.com/search/repositories'
    '?per_page=100'
    '&q=language:R fork:false pushed:{start}..{end}'
    '&page={page}'
)

# Period to crawl, and the width of the first time window tried.
START_DATE = datetime.datetime(2015, 1, 1)
END_DATE = datetime.datetime(2015, 6, 1)
INITIAL_DELTA = datetime.timedelta(days=7)

# When a window is resized, the new width aims at RATIO_MULTIPLIER * 1000 results.
RATIO_MULTIPLIER = 0.75

# Destination of the collected repositories.
# NOTE(review): the notebook writes this file with gzip.open as plain JSON, so
# despite the '.tar.gz' suffix it is not a tar archive.
OUTPUT = '../data/R-apiv3-{start}-{end}.tar.gz'.format(
    start=START_DATE.isoformat(),
    end=END_DATE.isoformat(),
)

# NOTE(review): hard-coded offset; only correct on a machine whose local time
# is GMT+2. Confirm before running elsewhere or across a DST change.
TIMEZONE_DELTA = 2 # +2h from GMT to us


def parsetime(x):
    """Parse the date string x, drop its timezone and shift it by TIMEZONE_DELTA hours."""
    return parser.parse(x, ignoretz=True) + datetime.timedelta(hours=TIMEZONE_DELTA)

In [4]:
def make_headers():
    """Select the API key that is usable soonest and build request headers for it.

    Relies on the module-level API_KEYS list of APIKey objects, which is
    sorted in place by ``until`` so that API_KEYS[0] is the selected key.
    A message is printed when the selection changes. If the selected key is
    not usable yet, the call sleeps until its ``until`` time plus one second.

    Returns:
        dict with the 'Authorization' and 'User-agent' headers of the selected key.
    """
    previous = API_KEYS[0]
    API_KEYS.sort(key=lambda k: k.until)
    current = API_KEYS[0]
    if previous != current:
        print("Using {}'s API key".format(current.owner))

    # Even the best key may still be rate limited: wait it out.
    remaining = (current.until - datetime.datetime.now()).total_seconds()
    if remaining > 0:
        pause = remaining + 1
        print('zzzZZZzzz ({} seconds)'.format(pause))
        time.sleep(pause)

    headers = dict()
    headers['Authorization'] = 'token ' + current.key
    headers['User-agent'] = current.owner
    return headers
    
def github_API(url):
    """GET ``url`` from the GitHub API, rotating keys on rate limits and retrying failures.

    Request headers come from make_headers(), which selects the key at
    API_KEYS[0] and sleeps first if that key is not usable yet.

    Outcomes per attempt:
      * status 200: the response is returned.
      * status 403 with ``X-RateLimit-Remaining: 0``: the current key is
        marked unusable until the reset time given in ``X-RateLimit-Reset``
        and the request is retried. This does not count as a failed try.
      * anything else: retried after a growing pause; the fifth failure raises.

    Args:
        url: full URL to request.

    Returns:
        The ``requests`` response object with status code 200.

    Raises:
        requests.exceptions.HTTPError: after 5 failed attempts; the last
            response is attached as ``.response``.
    """
    max_tries = 5
    tries = 0
    while True:
        r = requests.get(url, headers=make_headers())
        if r.status_code == 200:
            return r

        # .get(): an error response may lack the rate-limit headers entirely.
        reset = r.headers.get('X-RateLimit-Reset')
        rate_limited = (
            r.status_code == 403
            and r.headers.get('X-RateLimit-Remaining') == '0'
            and reset is not None
        )
        if rate_limited:
            # Compare both instants in UTC so the wait does not depend on the
            # local timezone or DST: the reset value is epoch seconds and the
            # HTTP Date header is expressed in GMT.
            reset_utc = datetime.datetime.utcfromtimestamp(int(reset))
            if 'Date' in r.headers:
                server_utc = parser.parse(r.headers['Date'], ignoretz=True)
            else:
                server_utc = datetime.datetime.utcnow()
            # Using the server's own clock for "now" cancels local clock skew.
            API_KEYS[0].until = datetime.datetime.now() + (reset_utc - server_utc)
        else:
            tries += 1
            if tries >= max_tries:
                # The token travels in a header, so the URL is safe to report.
                raise requests.exceptions.HTTPError(
                    'HTTP {} for {} after {} attempts'.format(r.status_code, url, tries),
                    response=r)
            # Back off (2, 4, 8, 16 s) instead of retrying immediately.
            time.sleep(2 ** tries)

In [5]:
# Crawl START_DATE..END_DATE in consecutive time windows. A query whose
# total_count exceeds MAX_SEARCH_RESULTS is not paged through; its window is
# shrunk and retried instead. Windows with few results are widened again.
MAX_SEARCH_RESULTS = 1000   # a window is only paged through at or below this count
MIN_DESIRED_RESULTS = 500   # below this the window is considered too narrow
PER_PAGE = 100              # must match per_page in URL
MIN_DELTA = datetime.timedelta(seconds=1)  # never let the window collapse to zero


def _fetch_page(start, end, page):
    """Return the decoded JSON body of one result page for the window start..end."""
    r = github_API(URL.format(start=start.isoformat(), end=end.isoformat(), page=page))
    return json.loads(r.content)


data = []
current_start = START_DATE
current_delta = INITIAL_DELTA

# NOTE(review): consecutive windows share their boundary instant (one window's
# end is the next one's start). If the pushed: range is inclusive at both ends,
# a repository pushed exactly on a boundary could be collected twice - confirm,
# and de-duplicate by id if needed.
while current_start < END_DATE:
    current_end = min(END_DATE, current_start + current_delta)
    content = _fetch_page(current_start, current_end, 1)
    # Keep the count from the first page: it drives both the paging and the
    # delta adjustment, and must not be overwritten by later pages.
    total_count = int(content['total_count'])
    print('{} {} {} with {}'.format(
        current_start.isoformat(), current_end.isoformat(), current_delta, total_count))

    if total_count <= MAX_SEARCH_RESULTS:
        data.extend(content['items'])
        # Ceiling division: exactly as many pages as needed (no empty extra
        # page when total_count is a multiple of PER_PAGE).
        pages = (total_count + PER_PAGE - 1) // PER_PAGE
        for page in range(2, pages + 1):
            data.extend(_fetch_page(current_start, current_end, page)['items'])
        current_start = current_end
    elif current_delta <= MIN_DELTA:
        # The window cannot shrink any further; stop instead of looping forever.
        raise RuntimeError('More than {} results between {} and {}'.format(
            MAX_SEARCH_RESULTS, current_start.isoformat(), current_end.isoformat()))

    # Adjust delta
    if total_count < MIN_DESIRED_RESULTS or total_count > MAX_SEARCH_RESULTS:
        # max(..., 1): an empty window must widen the delta, not divide by zero.
        ratio = RATIO_MULTIPLIER * MAX_SEARCH_RESULTS / max(total_count, 1)
        current_delta = max(
            MIN_DELTA,
            datetime.timedelta(seconds=int(current_delta.total_seconds() * ratio)))
        print('ratio = {} new delta is {}'.format(ratio, current_delta))


2015-01-01T00:00:00 2015-01-08T00:00:00 7 days, 0:00:00 with 537
2015-01-08T00:00:00 2015-01-15T00:00:00 7 days, 0:00:00 with 943
2015-01-15T00:00:00 2015-01-22T00:00:00 7 days, 0:00:00 with 1145
ratio = 0.655021834061 new delta is 4 days, 14:02:37
2015-01-15T00:00:00 2015-01-19T14:02:37 4 days, 14:02:37 with 631
2015-01-19T14:02:37 2015-01-24T04:05:14 4 days, 14:02:37 with 1221
ratio = 0.614250614251 new delta is 2 days, 19:35:39
2015-01-19T14:02:37 2015-01-22T09:38:16 2 days, 19:35:39 with 601
Using mgoeminne's API key
2015-01-22T09:38:16 2015-01-25T05:13:55 2 days, 19:35:39 with 1200
ratio = 0.625 new delta is 1 day, 18:14:46
2015-01-22T09:38:16 2015-01-24T03:53:02 1 day, 18:14:46 with 616
2015-01-24T03:53:02 2015-01-25T22:07:48 1 day, 18:14:46 with 1622
ratio = 0.462392108508 new delta is 19:32:03
2015-01-24T03:53:02 2015-01-24T23:25:05 19:32:03 with 415
ratio = 1.80722891566 new delta is 1 day, 11:18:09
2015-01-24T23:25:05 2015-01-26T10:43:14 1 day, 11:18:09 with 1804
ratio = 0.415742793792 new delta is 14:40:36
2015-01-24T23:25:05 2015-01-25T14:05:41 14:40:36 with 449
ratio = 1.67037861915 new delta is 1 day, 0:30:56
2015-01-25T14:05:41 2015-01-26T14:36:37 1 day, 0:30:56 with 1380
ratio = 0.54347826087 new delta is 13:19:25
2015-01-25T14:05:41 2015-01-26T03:25:06 13:19:25 with 1299
ratio = 0.577367205543 new delta is 7:41:33
2015-01-25T14:05:41 2015-01-25T21:47:14 7:41:33 with 707
Using narjisse-tabout's API key
2015-01-25T21:47:14 2015-01-26T05:28:47 7:41:33 with 617
2015-01-26T05:28:47 2015-01-26T13:10:20 7:41:33 with 48
ratio = 15.625 new delta is 5 days, 0:11:43
2015-01-26T13:10:20 2015-01-31T13:22:03 5 days, 0:11:43 with 691
2015-01-31T13:22:03 2015-02-05T13:33:46 5 days, 0:11:43 with 641
2015-02-05T13:33:46 2015-02-10T13:45:29 5 days, 0:11:43 with 787
Using tommens's API key
2015-02-10T13:45:29 2015-02-15T13:57:12 5 days, 0:11:43 with 720
2015-02-15T13:57:12 2015-02-20T14:08:55 5 days, 0:11:43 with 1173
ratio = 0.639386189258 new delta is 3 days, 4:51:04
2015-02-15T13:57:12 2015-02-18T18:48:16 3 days, 4:51:04 with 667
2015-02-18T18:48:16 2015-02-21T23:39:20 3 days, 4:51:04 with 1151
ratio = 0.651607298002 new delta is 2 days, 2:04:36
2015-02-18T18:48:16 2015-02-20T20:52:52 2 days, 2:04:36 with 650
2015-02-20T20:52:52 2015-02-22T22:57:28 2 days, 2:04:36 with 1740
ratio = 0.431034482759 new delta is 21:35:05
2015-02-20T20:52:52 2015-02-21T18:27:57 21:35:05 with 348
Using AlexandreDecan's API key
ratio = 2.15517241379 new delta is 1 day, 22:31:07
2015-02-21T18:27:57 2015-02-23T16:59:04 1 day, 22:31:07 with 1889
ratio = 0.397035468502 new delta is 18:28:10
2015-02-21T18:27:57 2015-02-22T12:56:07 18:28:10 with 536
2015-02-22T12:56:07 2015-02-23T07:24:17 18:28:10 with 1280
ratio = 0.5859375 new delta is 10:49:18
2015-02-22T12:56:07 2015-02-22T23:45:25 10:49:18 with 991
2015-02-22T23:45:25 2015-02-23T10:34:43 10:49:18 with 314
ratio = 2.38853503185 new delta is 1 day, 1:50:52
2015-02-23T10:34:43 2015-02-24T12:25:35 1 day, 1:50:52 with 160
ratio = 4.6875 new delta is 5 days, 1:09:41
2015-02-24T12:25:35 2015-03-01T13:35:16 5 days, 1:09:41 with 570
Using mgoeminne's API key
2015-03-01T13:35:16 2015-03-06T14:44:57 5 days, 1:09:41 with 677
2015-03-06T14:44:57 2015-03-11T15:54:38 5 days, 1:09:41 with 800
2015-03-11T15:54:38 2015-03-16T17:04:19 5 days, 1:09:41 with 774
2015-03-16T17:04:19 2015-03-21T18:14:00 5 days, 1:09:41 with 1407
ratio = 0.533049040512 new delta is 2 days, 16:35:05
2015-03-16T17:04:19 2015-03-19T09:39:24 2 days, 16:35:05 with 632
Using narjisse-tabout's API key
2015-03-19T09:39:24 2015-03-22T02:14:29 2 days, 16:35:05 with 983
2015-03-22T02:14:29 2015-03-24T18:49:34 2 days, 16:35:05 with 1731
ratio = 0.433275563258 new delta is 1 day, 3:58:58
2015-03-22T02:14:29 2015-03-23T06:13:27 1 day, 3:58:58 with 1456
ratio = 0.51510989011 new delta is 14:24:51
2015-03-22T02:14:29 2015-03-22T16:39:20 14:24:51 with 525
2015-03-22T16:39:20 2015-03-23T07:04:11 14:24:51 with 939
Using tommens's API key
2015-03-23T07:04:11 2015-03-23T21:29:02 14:24:51 with 136
ratio = 5.51470588235 new delta is 3 days, 7:29:23
2015-03-23T21:29:02 2015-03-27T04:58:25 3 days, 7:29:23 with 481
ratio = 1.55925155925 new delta is 5 days, 3:56:40
2015-03-27T04:58:25 2015-04-01T08:55:05 5 days, 3:56:40 with 830
2015-04-01T08:55:05 2015-04-06T12:51:45 5 days, 3:56:40 with 527
2015-04-06T12:51:45 2015-04-11T16:48:25 5 days, 3:56:40 with 845
Using AlexandreDecan's API key
2015-04-11T16:48:25 2015-04-16T20:45:05 5 days, 3:56:40 with 1201
ratio = 0.624479600333 new delta is 3 days, 5:24:02
2015-04-11T16:48:25 2015-04-14T22:12:27 3 days, 5:24:02 with 606
2015-04-14T22:12:27 2015-04-18T03:36:29 3 days, 5:24:02 with 1079
ratio = 0.695088044486 new delta is 2 days, 5:48:00
2015-04-14T22:12:27 2015-04-17T04:00:27 2 days, 5:48:00 with 854
2015-04-17T04:00:27 2015-04-19T09:48:27 2 days, 5:48:00 with 369
ratio = 2.0325203252 new delta is 4 days, 13:20:58
2015-04-19T09:48:27 2015-04-23T23:09:25 4 days, 13:20:58 with 1204
ratio = 0.62292358804 new delta is 2 days, 20:06:58
2015-04-19T09:48:27 2015-04-22T05:55:25 2 days, 20:06:58 with 637
Using mgoeminne's API key
2015-04-22T05:55:25 2015-04-25T02:02:23 2 days, 20:06:58 with 1010
ratio = 0.742574257426 new delta is 2 days, 2:34:52
2015-04-22T05:55:25 2015-04-24T08:30:17 2 days, 2:34:52 with 683
2015-04-24T08:30:17 2015-04-26T11:05:09 2 days, 2:34:52 with 1035
ratio = 0.724637681159 new delta is 1 day, 12:39:10
2015-04-24T08:30:17 2015-04-25T21:09:27 1 day, 12:39:10 with 656
2015-04-25T21:09:27 2015-04-27T09:48:37 1 day, 12:39:10 with 1691
ratio = 0.443524541691 new delta is 16:15:23
2015-04-25T21:09:27 2015-04-26T13:24:50 16:15:23 with 464
ratio = 1.61637931034 new delta is 1 day, 2:16:35
2015-04-26T13:24:50 2015-04-27T15:41:25 1 day, 2:16:35 with 1292
ratio = 0.580495356037 new delta is 15:15:11
2015-04-26T13:24:50 2015-04-27T04:40:01 15:15:11 with 1186
ratio = 0.632377740304 new delta is 9:38:44
2015-04-26T13:24:50 2015-04-26T23:03:34 9:38:44 with 969
Using narjisse-tabout's API key
2015-04-26T23:03:34 2015-04-27T08:42:18 9:38:44 with 249
ratio = 3.01204819277 new delta is 1 day, 5:03:10
2015-04-27T08:42:18 2015-04-28T13:45:28 1 day, 5:03:10 with 264
ratio = 2.84090909091 new delta is 3 days, 10:32:10
2015-04-28T13:45:28 2015-05-02T00:17:38 3 days, 10:32:10 with 624
2015-05-02T00:17:38 2015-05-05T10:49:48 3 days, 10:32:10 with 486
ratio = 1.54320987654 new delta is 5 days, 7:22:13
2015-05-05T10:49:48 2015-05-10T18:12:01 5 days, 7:22:13 with 1084
ratio = 0.691881918819 new delta is 3 days, 16:07:30
2015-05-05T10:49:48 2015-05-09T02:57:18 3 days, 16:07:30 with 817
Using tommens's API key
2015-05-09T02:57:18 2015-05-12T19:04:48 3 days, 16:07:30 with 804
2015-05-12T19:04:48 2015-05-16T11:12:18 3 days, 16:07:30 with 908
2015-05-16T11:12:18 2015-05-20T03:19:48 3 days, 16:07:30 with 980
Using AlexandreDecan's API key
2015-05-20T03:19:48 2015-05-23T19:27:18 3 days, 16:07:30 with 1759
ratio = 0.426378624218 new delta is 1 day, 13:34:28
2015-05-20T03:19:48 2015-05-21T16:54:16 1 day, 13:34:28 with 708
2015-05-21T16:54:16 2015-05-23T06:28:44 1 day, 13:34:28 with 775
2015-05-23T06:28:44 2015-05-24T20:03:12 1 day, 13:34:28 with 1208
ratio = 0.620860927152 new delta is 23:19:42
2015-05-23T06:28:44 2015-05-24T05:48:26 23:19:42 with 523
2015-05-24T05:48:26 2015-05-25T05:08:08 23:19:42 with 1242
ratio = 0.6038647343 new delta is 14:05:13
2015-05-24T05:48:26 2015-05-24T19:53:39 14:05:13 with 662
Using mgoeminne's API key
2015-05-24T19:53:39 2015-05-25T09:58:52 14:05:13 with 634
2015-05-25T09:58:52 2015-05-26T00:04:05 14:05:13 with 179
ratio = 4.18994413408 new delta is 2 days, 11:01:24
2015-05-26T00:04:05 2015-05-28T11:05:29 2 days, 11:01:24 with 848
2015-05-28T11:05:29 2015-05-30T22:06:53 2 days, 11:01:24 with 1032
ratio = 0.726744186047 new delta is 1 day, 18:53:41
2015-05-28T11:05:29 2015-05-30T05:59:10 1 day, 18:53:41 with 850
Using narjisse-tabout's API key
2015-05-30T05:59:10 2015-06-01T00:00:00 1 day, 18:53:41 with 543

In [6]:
# Serialise the collected repositories and store them gzip-compressed.
# NOTE: OUTPUT ends in '.tar.gz', but what is written here is a plain
# gzip-compressed JSON document, not a tar archive.
data_json = json.dumps(data, indent=0)
# The context manager closes (and flushes) the file even if the write fails.
with gzip.open(OUTPUT, 'wb') as f_out:
    # json.dumps escapes non-ASCII by default, so this encode is lossless; it
    # makes the payload explicitly bytes, as the binary 'wb' mode expects.
    f_out.write(data_json.encode('utf-8'))